Loading R libraries

The following R libraries have been used for the analysis.

# Global knitr chunk options: show code, hide messages/warnings, cache
# chunks with dependency tracking, render figures full-width at 300 dpi,
# and force static screenshots of HTML widgets for non-HTML output.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  cache = TRUE,
  autodep = TRUE,
  screenshot.force = TRUE,
  out.width = "100%",
  dpi = 300,
  cache.lazy = FALSE
)
# Install CRAN packages
# Packages required by this analysis; only the ones missing from the
# local library are installed below.
list.of.packages <- c(
  "data.table",
  "tidyverse",
  "lubridate",
  "plotly",
  "leaflet",
  "geosphere",
  "ggthemes",
  "ggcorrplot",
  "webshot",
  "caret",
  "xgboost",
  "car"
)

# Requested packages that are not yet installed
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]

# Use TRUE (not the reassignable shorthand T) and an explicit length
# comparison rather than relying on integer truthiness.
if (length(new.packages) > 0) {
  install.packages(new.packages, repos = "https://cran.rstudio.com/", dependencies = TRUE)
}

# Attach every required package (same order as list.of.packages),
# silencing startup messages and warnings while loading.
suppressWarnings(suppressMessages(suppressPackageStartupMessages(
  invisible(lapply(list.of.packages, library, character.only = TRUE))
)))

Preparing Data

Load Datasets

The zipped files were downloaded manually from the link provided and decompressed to extract the CSVs.
The CSVs are loaded into memory using the fread function from the data.table package.

# Read the April 2013 fare and trip CSV extracts into data.tables;
# fread() gives fast, multi-threaded parsing for these large files.
trip_fare <- fread('../Data/trip_fare_4.csv')
trip_data <- fread('../Data/trip_data_4.csv')

Merge two datasets

Let’s merge the two datasets to create a single dataset containing all the information.
Since the data does not come with a data dictionary, we will try to define the primary key for the two datasets. Ideally, a unique trip can be identified by taxi identifier together with a given pick-up datetime.

  • medallion - unique taxi permit
  • hack_license - unique taxi license number
  • vendor_id - Taxi service provider company
  • pickup_datetime - Adding in time component to uniquely identify a single trip

Based on the primary keys, we will identify if we have duplicates in the two datasets before merging them.

# The first four columns (medallion, hack_license, vendor_id,
# pickup_datetime) form the assumed primary key; key both tables on
# them so the de-duplication and join below use the same columns.
join_keycols <- names(trip_fare)[1:4]
setkeyv(trip_fare, join_keycols)
setkeyv(trip_data, join_keycols)

# Number of records in trip fare dataset
trip_fare[, .N]
#15100468

# Unique records in trip fare dataset by the primary key
uniqueN(trip_fare, by = key(trip_fare))
#15099816

# Number of records in trip data dataset
trip_data[,.N]
#15100468

# Unique records in trip data dataset by the primary key
uniqueN(trip_data, by = key(trip_data))
#15099816

Looking at the statistics above and based on the defined primary key to identify a unique trip, we do see some duplicates in both the datasets. We need to ensure the duplicates are removed and that both datasets contain the same trips before merging them.

# De-duplicate both tables on the primary key (keeps the first
# occurrence of each key combination).
trip_fare <- unique(trip_fare, by = key(trip_fare))
trip_data <- unique(trip_data, by = key(trip_data))

# Inner JOIN makes sure that we have only the common trips from both the datasets
# (keyed data.table join; nomatch = 0 drops rows with no match)
trip_combined <- trip_fare[trip_data, nomatch = 0]
trip_combined[, .N]
#15099816

Sample dataset

Since the dataset is large and resources are limited, we will work with a sample of the data instead of the entire dataset. Here, we are taking a random sample of 10% of the entire dataset. A seed is set before taking the sample to make sure the results are reproducible every time the code is re-run. We will save the sample dataset to a file so that we don’t have to repeat these steps every time the code is re-run and can instead read the sample dataset directly from disk.

# Seed first so the 10% sample is reproducible across re-runs
set.seed(131)

# Randomly flag ~10% of the records for the sample
trip_combined[, sample_flg := sample(c(TRUE, FALSE), size = .N, replace = TRUE, prob = c(0.1, 0.9))]

# Subset the flagged 10% of the records as the sample dataset
# (TRUE, not the reassignable shorthand T)
trip_combined_sample <- trip_combined[sample_flg == TRUE]

# Save the contents to disk
fwrite(trip_combined_sample, '../Data/trip_combined_sample.csv')

# Compress the csv (the sprintf() wrapper was a no-op on a fixed string)
system("gzip -f ../Data/trip_combined_sample.csv")

Load the sample dataset

trip_combined_sample <- fread('../Data/trip_combined_sample.csv.gz')

Exploratory Data Analysis

Descriptive Statistics

Let’s look at the data type of the different variables we have in the dataset.

glimpse(trip_combined_sample)
## Observations: 1,508,586
## Variables: 22
## $ medallion          <chr> "00005007A9F30E289E760362F69E4EAD", "00005007…
## $ hack_license       <chr> "1E3F1640D7AE96FADBF4612DF04AC3E3", "39C3E34B…
## $ vendor_id          <chr> "CMT", "CMT", "CMT", "CMT", "CMT", "CMT", "CM…
## $ pickup_datetime    <chr> "2013-04-24 21:57:34", "2013-04-04 08:46:48",…
## $ payment_type       <chr> "CRD", "CRD", "CSH", "CSH", "CSH", "CRD", "CS…
## $ fare_amount        <dbl> 18.0, 7.0, 11.5, 9.0, 4.5, 13.0, 5.0, 6.5, 11…
## $ surcharge          <dbl> 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, …
## $ mta_tax            <dbl> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, …
## $ tip_amount         <dbl> 3.80, 1.50, 0.00, 0.00, 0.00, 2.70, 0.00, 0.0…
## $ tolls_amount       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ total_amount       <dbl> 22.80, 9.00, 12.00, 9.50, 5.00, 16.20, 5.50, …
## $ rate_code          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ store_and_fwd_flag <chr> "N", "N", "Y", "Y", "N", "N", "N", "N", "Y", …
## $ dropoff_datetime   <chr> "2013-04-24 22:16:29", "2013-04-04 08:54:53",…
## $ passenger_count    <int> 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, …
## $ trip_time_in_secs  <int> 1135, 484, 1055, 667, 217, 1094, 241, 470, 92…
## $ trip_distance      <dbl> 5.0, 1.0, 1.5, 1.6, 0.6, 2.5, 0.6, 0.6, 2.0, …
## $ pickup_longitude   <dbl> -74.00161, -73.96178, -73.98432, -73.97349, -…
## $ pickup_latitude    <dbl> 40.73072, 40.76859, 40.76865, 40.75900, 40.75…
## $ dropoff_longitude  <dbl> -73.95048, -73.97217, -73.97810, -73.95551, -…
## $ dropoff_latitude   <dbl> 40.78067, 40.76031, 40.75504, 40.77824, 40.75…
## $ sample_flg         <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU…

The date times are read as strings (character) format. Let’s convert date strings to date time format in R

# Parse both "YYYY-MM-DD HH:MM:SS" character columns into POSIXct in
# one pass, applying the same format string to each.
trip_combined_sample[, c("pickup_datetime", "dropoff_datetime") :=
                       lapply(.SD, as.POSIXct, format = "%Y-%m-%d %H:%M:%OS"),
                     .SDcols = c("pickup_datetime", "dropoff_datetime")]

Summary of data shows that the taxi trips are recorded for only a month, and is from April 2013.

summary(trip_combined_sample)
##   medallion         hack_license        vendor_id        
##  Length:1508586     Length:1508586     Length:1508586    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  pickup_datetime               payment_type        fare_amount    
##  Min.   :2013-04-01 00:00:00   Length:1508586     Min.   :  2.50  
##  1st Qu.:2013-04-08 16:36:59   Class :character   1st Qu.:  6.50  
##  Median :2013-04-16 00:57:00   Mode  :character   Median :  9.50  
##  Mean   :2013-04-16 01:53:27                      Mean   : 12.28  
##  3rd Qu.:2013-04-23 14:04:00                      3rd Qu.: 14.00  
##  Max.   :2013-04-30 23:59:58                      Max.   :500.00  
##                                                                   
##    surcharge         mta_tax         tip_amount       tolls_amount    
##  Min.   :0.0000   Min.   :0.0000   Min.   :  0.000   Min.   : 0.0000  
##  1st Qu.:0.0000   1st Qu.:0.5000   1st Qu.:  0.000   1st Qu.: 0.0000  
##  Median :0.0000   Median :0.5000   Median :  1.000   Median : 0.0000  
##  Mean   :0.3264   Mean   :0.4983   Mean   :  1.348   Mean   : 0.2445  
##  3rd Qu.:0.5000   3rd Qu.:0.5000   3rd Qu.:  2.000   3rd Qu.: 0.0000  
##  Max.   :3.0000   Max.   :0.5000   Max.   :160.000   Max.   :20.0000  
##                                                                       
##   total_amount     rate_code     store_and_fwd_flag
##  Min.   :  2.5   Min.   :0.000   Length:1508586    
##  1st Qu.:  8.0   1st Qu.:1.000   Class :character  
##  Median : 11.0   Median :1.000   Mode  :character  
##  Mean   : 14.7   Mean   :1.033                     
##  3rd Qu.: 16.5   3rd Qu.:1.000                     
##  Max.   :510.0   Max.   :6.000                     
##                                                    
##  dropoff_datetime              passenger_count trip_time_in_secs
##  Min.   :2013-04-01 00:01:00   Min.   :0.000   Min.   :    0.0  
##  1st Qu.:2013-04-08 16:49:21   1st Qu.:1.000   1st Qu.:  360.0  
##  Median :2013-04-16 01:08:00   Median :1.000   Median :  600.0  
##  Mean   :2013-04-16 02:05:58   Mean   :1.711   Mean   :  747.6  
##  3rd Qu.:2013-04-23 14:18:00   3rd Qu.:2.000   3rd Qu.:  960.0  
##  Max.   :2013-05-01 00:40:00   Max.   :9.000   Max.   :10800.0  
##                                                                 
##  trip_distance    pickup_longitude   pickup_latitude    dropoff_longitude
##  Min.   : 0.000   Min.   :-1351.09   Min.   :-3117.30   Min.   :-736.02  
##  1st Qu.: 1.050   1st Qu.:  -73.99   1st Qu.:   40.74   1st Qu.: -73.99  
##  Median : 1.790   Median :  -73.98   Median :   40.75   Median : -73.98  
##  Mean   : 2.867   Mean   :  -72.73   Mean   :   40.06   Mean   : -72.68  
##  3rd Qu.: 3.200   3rd Qu.:  -73.97   3rd Qu.:   40.77   3rd Qu.: -73.96  
##  Max.   :85.700   Max.   :   73.94   Max.   : 2316.22   Max.   : 100.99  
##                                                         NA's   :15       
##  dropoff_latitude   sample_flg    
##  Min.   :-3517.93   Mode:logical  
##  1st Qu.:   40.73   TRUE:1508586  
##  Median :   40.75                 
##  Mean   :   40.04                 
##  3rd Qu.:   40.77                 
##  Max.   : 1608.48                 
##  NA's   :15

Missing Data Analysis

The summary statistics show that there are missing values for only two of the variables, dropoff_longitude and dropoff_latitude. There are techniques to analyse missing values (missing at random, missing completely at random, or missing not at random), and there are techniques to treat them, such as removing the entire record containing missing values or imputing them with the mean, median or some fixed value. But for the purpose of this analysis, we will simply remove them, as the missing observations are a tiny proportion of the sample dataset.

# Remove the null values from drop-off lat and long: keep only rows
# where BOTH drop-off coordinates are present (15 NAs of ~1.5M rows).
trip_combined_sample <- trip_combined_sample[!(is.na(dropoff_longitude) | is.na(dropoff_latitude))]

Outlier Analysis & Data Cleaning

From the summary statistics, looking at the range of the values of lat and long, it appears the data is either incorrect or the lat/long are captured in multiple formats.
Assuming lat and long are captured in degrees (decimal) format, we will remove any erroneous data that does not conform to the degrees (decimal) format.
Since the latitudes range from -90 to 90 and longitudes range from -180 to 180, any data that is outside that range will be removed from the dataset.

# Lat long range (lat from -90 to 90 and long from -180 to 180):
# a single combined filter — conjunction of the pickup and drop-off
# conditions — is equivalent to filtering twice in sequence.
trip_combined_sample <- trip_combined_sample[
  pickup_latitude %between% c(-90, 90) & pickup_longitude %between% c(-180, 180) &
    dropoff_latitude %between% c(-90, 90) & dropoff_longitude %between% c(-180, 180)
]

The range of lat and long is now within the standard range but it is still too large considering the data is only from New York taxi services.

# Post-filter coordinate ranges: valid degrees now, but still far
# wider than the New York area, so further trimming is needed.
summary(trip_combined_sample$pickup_latitude)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -74.02   40.74   40.75   40.07   40.77   73.99
summary(trip_combined_sample$pickup_longitude)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -96.70  -73.99  -73.98  -72.73  -73.97   73.94
summary(trip_combined_sample$dropoff_latitude)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -74.01   40.73   40.75   40.05   40.77   74.02
summary(trip_combined_sample$dropoff_longitude)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -153.96  -73.99  -73.98  -72.68  -73.96  100.99

The histograms and box plots for the various lats and longs show that most of the data points are concentrated around specific coordinates, but some of the trips are spread all over the world.

# Base-graphics histogram of pickup latitude
hist(trip_combined_sample$pickup_latitude,
     main = "Histogram for Pickup Latitude",
     xlab = "Pickup Latitude",
     border = "black",
     col = "lightblue")

# Box plot of pickup latitude; x is a constant string so everything
# lands in one box. The separate theme() calls are merged into one
# (theme settings combine additively either way).
ggplot(trip_combined_sample, aes(x = "pickup_latitude", y = pickup_latitude)) +
  geom_boxplot() +
  labs(x = "", y = "") +
  ggtitle("Box Plot for Pickup Latitude") +
  theme(panel.background = element_blank(),
        axis.line = element_line(colour = "black"),
        plot.title = element_text(hjust = 0.5),
        legend.position = "none",
        axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  scale_colour_tableau()

We know that NYC is in northern hemisphere. Some of the pickups are from equator (latitude = 0) and some are from southern hemisphere.
Clearly data is either wrongly captured or there were errors in capturing the data.

# Histogram and single-group box plot of pickup longitude
hist(trip_combined_sample$pickup_longitude,
     main="Histogram for Pickup Longitude", 
     xlab="Pickup Longitude", 
     border="black", 
     col="lightblue")

# x is a constant string, so all values fall into one box
ggplot(trip_combined_sample, aes(x="pickup_longitude", y=pickup_longitude)) +
  geom_boxplot() +
  labs(x = "", y = "")+
  ggtitle("Box Plot for Pickup Longitude") +
  theme(panel.background = element_blank(), 
        axis.line = element_line(colour = "black"), 
        plot.title = element_text(hjust = 0.5))+
  theme(legend.position = "none") + 
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  scale_colour_tableau()

# Histogram and single-group box plot of drop-off latitude
hist(trip_combined_sample$dropoff_latitude,
     main="Histogram for Dropoff Latitude", 
     xlab="Dropoff Latitude", 
     border="black", 
     col="lightblue")

# x is a constant string, so all values fall into one box
ggplot(trip_combined_sample, aes(x="dropoff_latitude", y=dropoff_latitude)) +
  geom_boxplot() +
  labs(x = "", y = "")+
  ggtitle("Box Plot for Dropoff Latitude") +
  theme(panel.background = element_blank(), 
        axis.line = element_line(colour = "black"), 
        plot.title = element_text(hjust = 0.5))+
  theme(legend.position = "none") + 
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  scale_colour_tableau()

# Histogram and single-group box plot of drop-off longitude
hist(trip_combined_sample$dropoff_longitude,
     main="Histogram for Dropoff Longitude", 
     xlab="Dropoff Longitude", 
     border="black", 
     col="lightblue")

# x is a constant string, so all values fall into one box
ggplot(trip_combined_sample, aes(x="dropoff_longitude", y=dropoff_longitude)) +
  geom_boxplot() +
  labs(x = "", y = "")+
  ggtitle("Box Plot for Dropoff Longitude") +
  theme(panel.background = element_blank(), 
        axis.line = element_line(colour = "black"), 
        plot.title = element_text(hjust = 0.5)) +
  theme(legend.position = "none") + 
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  scale_colour_tableau()

Since we are dealing with New York data, the lat and long can further be limited to New York and nearby. NYC coordinates are 40.7141667 lat and -74.0063889 long and if we restrict the pickups and drops within 1000 km radius, we should have a reliable dataset about the NYC taxi trips only

# NYC range within ~1000 km radius (1 degree is approx. 111 km);
# NYC centre is 40.7141667 lat, -74.0063889 long. One combined filter
# on pickup AND drop-off is equivalent to the two sequential filters.
trip_combined_sample <- trip_combined_sample[
  pickup_latitude %between% c(30, 50) & pickup_longitude %between% c(-84, -64) &
    dropoff_latitude %between% c(30, 50) & dropoff_longitude %between% c(-84, -64)
]

#boxplot.stats(trip_combined_sample$pickup_longitude)

Let’s visualize the pick-up lat longs on the map. We can see that most of trips are concentrated around Manhattan borough (often referred as the city) and some of the pick-ups are from nearby airports.

# Interactive map of the first 1,000 pickups (widget output only);
# commented addProviderTiles lines are alternative base maps.
leaflet(data = head(trip_combined_sample, 1000)) %>%
  addTiles()%>%
  #addProviderTiles("Stamen.TonerLite")%>%
  #addProviderTiles("Esri.NatGeoWorldMap") %>%
  addCircleMarkers(~ pickup_longitude, ~pickup_latitude, radius = 1,
                   color = "navy", fillOpacity = 0.3)

Some of the trips have a trip distance of 0 and some have trip duration of 0. For this analysis, we will drop them from our datasets

# Remove degenerate records in one pass: trips with zero distance or
# zero duration (conjunction of the two original sequential filters).
trip_combined_sample <- trip_combined_sample[trip_distance > 0 & trip_time_in_secs > 0]

Univariate Analysis

Distribution of Passenger Count

Most of the trips are single-passenger trips. We do have 0-passenger trips; maybe they are delivery trips that do not include any passengers. And there are some trips with 9 passengers. We are not excluding any trips based on passenger count, as all of these seem to be genuine trips and may contribute towards the fare amount predictions.

# Count trips grouped by passenger count
plot_passenger_dist <- trip_combined_sample[, .N, by = .(passenger_count)]

# Convert the passenger count to a categorical variable (in place)
plot_passenger_dist[, passenger_count := as.factor(passenger_count)]

# Visualize on a square-root count scale so rare counts stay visible;
# the separate theme() calls are merged (theme settings combine).
ggplot(plot_passenger_dist, aes(passenger_count, N, fill = passenger_count)) +
  geom_col() +
  scale_y_sqrt() +
  labs(x = "Passenger Count", y = "Number of Trips") +
  ggtitle("Histogram of Passenger Count") +
  theme(panel.background = element_blank(),
        axis.line = element_line(colour = "black"),
        plot.title = element_text(hjust = 0.5),
        legend.position = "none") +
  scale_colour_tableau()

Distribution of Payment Type

Since there is no data dictionary for the abbreviated payment types. We are assuming CRD means a credit card transaction and CSH mean a cash transaction. UNK might be Unknown transactions. Most of the transactions are either CRD or CSH.

# Count trips group by payment type
plot_payment_type_dist <- trip_combined_sample[, .N, by = list(payment_type)]

# Convert the payment type to a categorical variable
plot_payment_type_dist$payment_type <- as.factor(plot_payment_type_dist$payment_type)

# Visualize; sqrt y-scale keeps the rare payment types visible
ggplot(plot_payment_type_dist, aes(payment_type, N, fill = payment_type)) +
  geom_col() +
  scale_y_sqrt() +
  labs(x = "Payment Type", y = "Number of Trips")+
  ggtitle("Histogram of Payment Type") +
  theme(panel.background = element_blank(), 
        axis.line = element_line(colour = "black"), 
        plot.title = element_text(hjust = 0.5)) +
  theme(legend.position = "none") + 
  scale_colour_tableau()

Distribution of Fare Amount

Most of the trips are in the under-$80 range. Some of the high fare amounts (>$100) need to be explored further. The fare amount should be a function of trip distance and trip duration, so we will check whether the high fares are justified by their corresponding trip distances and durations or whether they are outliers.

# Fare amount distribution in $20 bins
ggplot(trip_combined_sample, aes(fare_amount)) +
  geom_histogram(binwidth = 20,
                 col="black", 
                 fill="light blue") +
  labs(x = "Fare Amount", y = "Number of Trips")+
  ggtitle("Histogram of Fare Amount") +
  theme(panel.background = element_blank(), 
        axis.line = element_line(colour = "black"), 
        plot.title = element_text(hjust = 0.5))+
  theme(legend.position = "none") + 
  scale_colour_tableau()

Distribution of Tip Amount

Most of the tips are under $10 range. Some of the high tip amounts needs to be explored further along with fare amount to detect any outliers.

# Tip amount distribution in $10 bins
ggplot(trip_combined_sample, aes(tip_amount)) +
  geom_histogram(binwidth = 10,
                 col="black", 
                 fill="light blue") +
  labs(x = "Tip Amount", y = "Number of Trips")+
  ggtitle("Histogram of Tip Amount") +
  theme(panel.background = element_blank(), 
        axis.line = element_line(colour = "black"), 
        plot.title = element_text(hjust = 0.5))+
  theme(legend.position = "none") + 
  scale_colour_tableau()

Distribution of Total Amount

Most of the total amounts are under $100 range and are along the lines of fare amount.

# Total amount distribution in $20 bins
ggplot(trip_combined_sample, aes(total_amount)) +
  geom_histogram(binwidth = 20,
                 col="black", 
                 fill="light blue") +
  labs(x = "Total Amount", y = "Number of Trips")+
  ggtitle("Histogram of Total Amount") +
  theme(panel.background = element_blank(), 
        axis.line = element_line(colour = "black"), 
        plot.title = element_text(hjust = 0.5))+
  theme(legend.position = "none") + 
  scale_colour_tableau()

Feature Engineering

Fare per unit distance

We know that the fare amount comprises multiple price components such as starting fare, fare per mile/km, waiting fares, tolls, surcharges and other fees. The major component would be fare per mile/km (for longer trips). Let’s compute a fare per unit distance and try to find any abnormalities in the data that could impact predicting the fare prices. We will remove any records that fall outside 2 standard deviations.

# Fare per unit distance; safe because zero-distance trips were
# removed earlier, so no division by zero can occur here.
trip_combined_sample[, fare_per_dist := (fare_amount/trip_distance)]

# Calculate 2 standard deviations around the mean (the lower bound is
# negative, so only the upper bound matters for trimming)
trip_combined_sample[,.(min_fare = min(fare_per_dist), 
                        max_fare = max(fare_per_dist),
                        sd_2_fare_upper = mean(fare_per_dist) + 2 * sd(fare_per_dist), 
                        sd_2_fare_lower = mean(fare_per_dist) - 2 * sd(fare_per_dist)
                        )
                     ]
##      min_fare max_fare sd_2_fare_upper sd_2_fare_lower
## 1: 0.03172589    10000        40.75357       -28.67321
# Visualize if the trip duration has an impact on the high fare_per_dist for lower trip distances

# Scatter of fare-per-distance vs distance for the first 10k rows,
# coloured by trip duration in minutes; the two labs() calls are
# merged into one (label settings combine either way).
ggplot(head(trip_combined_sample, 10000)) +
  geom_point(aes(x = trip_distance, y = fare_per_dist, col = (trip_time_in_secs/60))) +
  labs(x = "Trip Distance",
       y = "Fare amount per unit distance",
       col = "Trip Time (in mins)") +
  ggtitle("Fare amount per unit distance Vs trip distance & time") +
  theme(panel.background = element_blank(),
        axis.line = element_line(colour = "black"),
        plot.title = element_text(hjust = 0.5))

Looking at the chart above, it seems that some of the high fare does not have that high trip durations. They seem to be outliers. Let’s remove the extreme tail ends of fare_per_dist (2 standard deviations)

trip_combined_sample <- trip_combined_sample[fare_per_dist < (mean(fare_per_dist) + 2 * sd(fare_per_dist))]

Day of week

To analyse the impact of time/day on the number of trips, let’s create new features/variables and see the impact of day of week on the volume of trips made.

# Day-of-week label (Sun..Sat ordered factor) from lubridate::wday
trip_combined_sample[, pickup_wday := wday(pickup_datetime, label = TRUE)]

# Trip counts per weekday, split by vendor
plot_wday_trips <- trip_combined_sample[, .N, by = list(pickup_wday, vendor_id)]

ggplot(plot_wday_trips, aes(pickup_wday, N, colour = vendor_id)) +
  geom_point(size = 4) +
  labs(x = "Day of the week", y = "Total number of pickups") +
  labs(col = "Vendor") +
  ggtitle("Taxi trips by day of the week") +
  theme(panel.background = element_blank(), 
        axis.line = element_line(colour = "black"), 
        plot.title = element_text(hjust = 0.5))

Looking at the chart above, it shows that Tuesdays were the busiest days for the month of April, 2013 followed by Mondays, Saturdays and Fridays.

Hour of the day

As the fares are dependent on the time of day (odd hours fares are higher than normal business hours fares), let’s compute the hour of the day feature/variable to see the impact of time of the day on the volumes of trips and average fare amount.

# Hour of day (0-23) of the pickup
trip_combined_sample[, pickup_hour := hour(pickup_datetime)]

# Trip counts per hour; bars ordered by descending volume via reorder()
plot_hourly_trips <- trip_combined_sample[, .N, by = list(pickup_hour)]

ggplot(plot_hourly_trips, aes(reorder(pickup_hour, -N), N, fill = reorder(pickup_hour, -N))) +
  geom_col() +
  scale_y_sqrt() +
  labs(x = "Hour of the day", y = "Total number of pickups") +
  ggtitle("Taxi trips by hour of the day") +
  theme(panel.background = element_blank(), 
        axis.line = element_line(colour = "black"), 
        plot.title = element_text(hjust = 0.5)) +
  theme(legend.position = "none")

From the chart, it is clear that the busiest hours are 7pm, 6pm, 8pm, 9pm and 10 pm. In short the evening times are the busiest from 6-10pm

# Mean fare per pickup hour; bars ordered by descending average fare
plot_hourly_fares <- trip_combined_sample[, .(mean_hourly_fare = mean(fare_amount)), by = list(pickup_hour)]

ggplot(plot_hourly_fares, aes(reorder(pickup_hour, -mean_hourly_fare), mean_hourly_fare, fill = reorder(pickup_hour, -mean_hourly_fare))) +
  geom_col() +
  scale_y_sqrt() +
  labs(x = "Hour of the day", y = "Average Fare Amount") +
  ggtitle("Fare amount by hour of the day") +
  theme(panel.background = element_blank(), 
        axis.line = element_line(colour = "black"), 
        plot.title = element_text(hjust = 0.5)) +
  theme(legend.position = "none")

Early morning fare averages are high peaking at 5am.

# Mean trip duration (secs) per pickup hour, ordered by descending mean
plot_hourly_trip_duration <- trip_combined_sample[, .(mean_hourly_trip_duration = mean(trip_time_in_secs)), by = list(pickup_hour)]

ggplot(plot_hourly_trip_duration, aes(reorder(pickup_hour, -mean_hourly_trip_duration), mean_hourly_trip_duration, fill = reorder(pickup_hour, -mean_hourly_trip_duration))) +
  geom_col() +
  scale_y_sqrt() +
  labs(x = "Hour of the day", y = "Average Trip Duration") +
  ggtitle("Trip Duration by hour of the day") +
  theme(panel.background = element_blank(), 
        axis.line = element_line(colour = "black"), 
        plot.title = element_text(hjust = 0.5)) +
  theme(legend.position = "none")

Trip in the afternoons till evenings are longer duration trips, probably because of traffic.

Pickup distance from NYC centre

To see which are the busiest locations for taxi pickups and the impact of location on fare amount, let us compute a feature/variable which depicts the distance from the city centre. The hypothesis is that the locations closer to the city centre will be the busiest.

# New York City centre coordinates, (longitude, latitude) — the order
# geosphere expects. Use <- for top-level assignment, not =.
nyc <- c(-74.0063889, 40.7141667)

# Haversine distance (metres) from each pickup to the NYC centre.
# Assign via := with as.numeric() so the column is a plain numeric
# vector rather than the one-column matrix distm() returns (which the
# original $<- assignment stored as a matrix column).
trip_combined_sample[, pickup_dist_from_city := as.numeric(
  distm(cbind(pickup_longitude, pickup_latitude), nyc, fun = distHaversine)
)]

Converting distance in meters to kilometers

# Convert distance to km (distm returned metres), rounded to 2 dp
trip_combined_sample[, pickup_dist_from_city := round(pickup_dist_from_city/1000, 2)]

summary(trip_combined_sample$pickup_dist_from_city)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    0.000    3.110    5.050    5.454    7.060 1206.340

The summary shows the maximum distance of a pickup from city center is 1200 km and is consistent with the ~1000km radius we applied to the sample dataset based on lat and long degrees. 75% of the pickups are within 7km from city centre.

# Bin pickup distance into 2 km intervals: [0,2), [2,4), ... up to the
# max. Reference the column directly in j — the original
# trip_combined_sample$ prefix was redundant inside data.table scope
# and would silently mis-align if this were ever run on a subset.
trip_combined_sample[, km_bin := cut(pickup_dist_from_city,
                                     seq(0, max(pickup_dist_from_city) + 2, by = 2),
                                     include.lowest = TRUE,
                                     right = FALSE)]

# Pickups per 2 km bin, restricted to within 30 km of the city centre
plot_distance_from_city <- trip_combined_sample[pickup_dist_from_city <= 30, .N, by = list(km_bin)]

ggplot(plot_distance_from_city, aes(reorder(km_bin, -N), N, fill = reorder(km_bin, -N))) +
  geom_col() +
  scale_y_sqrt() +
  labs(x = "Distance from city (km) 2 km bins", y = "Number of pickups") +
  ggtitle("Number of pickups by distance from city") +
  theme(panel.background = element_blank(), 
        axis.line = element_line(colour = "black"), 
        plot.title = element_text(hjust = 0.5)) +
  theme(legend.position = "none")

The above chart shows top ten locations (in terms of radial distance from NYC center). The top 5 busiest locations are within 10km radius of the New York city center.

Trip with highest Standard Deviation

The trip duration ranges from a minimum of 1 sec to a maximum of 10800 secs. Per 3 standard deviations, 99.7% of trip durations would fall within (-895.3775, 2397.632). The sample dataset does not have any negative values and the distribution is positively (right) skewed. Hence, the trip with the maximum duration of 10800 secs is the trip that deviates the most, in standard deviations, from the mean.

# Range of trip durations in the cleaned sample
trip_combined_sample[ , .(min = min(trip_time_in_secs), max = max(trip_time_in_secs))]
##    min   max
## 1:   1 10800
# Mean +/- 3 sd bounds; the max (10800 s) lies far above the upper bound
trip_combined_sample[ , .(lower_bound = mean(trip_time_in_secs) - 3*sd(trip_time_in_secs), upper_bound = mean(trip_time_in_secs) + 3*sd(trip_time_in_secs))]
##    lower_bound upper_bound
## 1:   -895.3775    2397.632
# Smoothed density of trip duration showing the right skew
ggplot(trip_combined_sample, aes(trip_time_in_secs, fill = "red", colour = 'red')) +
  geom_density(adjust = 5, alpha = 0.2) +
  labs(x = "Trip Duration (in secs)", y = "Density") +
  ggtitle("Density plot of Trip Duration") +
  theme(panel.background = element_blank(), 
        axis.line = element_line(colour = "black"), 
        plot.title = element_text(hjust = 0.5)) +
  theme(legend.position = "none")

Standard Deviation of Fare Amount

Trip with the most consistent fare amount is the one with the least standard deviation which implies the trip with the fare amount closest to the mean fare amount.

# The single row whose fare_amount is closest to the mean fare
# (which.min on the absolute deviation gives its row index).
trip_combined_sample[trip_combined_sample[, which.min(abs(fare_amount - mean(fare_amount)))]]
##                           medallion                     hack_license
## 1: 4233044C535D5D34B8E53D7C9AAB777F 41822E650532969D6DB37B46BEE5159E
##    vendor_id     pickup_datetime payment_type fare_amount surcharge
## 1:       CMT 2013-04-04 08:34:00          CSH        12.1         0
##    mta_tax tip_amount tolls_amount total_amount rate_code
## 1:     0.5          0            0         12.6         1
##    store_and_fwd_flag    dropoff_datetime passenger_count
## 1:                  N 2013-04-04 08:53:00               1
##    trip_time_in_secs trip_distance pickup_longitude pickup_latitude
## 1:              1156           3.1        -73.96526        40.79104
##    dropoff_longitude dropoff_latitude sample_flg fare_per_dist pickup_wday
## 1:         -73.97497         40.75634       TRUE      3.903226         Thu
##    pickup_hour pickup_dist_from_city km_bin
## 1:           8                  9.23 [8,10)

Multi-collinearity Analysis

Looking at the chart below, the fare amount has a fairly high linear relationship with trip distance and trip duration which is actually true as fare amount is directly proportional to the distance travelled and time taken by the trip.

# Numeric-only modelling dataset for the correlation matrix: datetimes
# and the weekday factor are coerced to numeric. pickup_wday now
# references the column directly — the original
# trip_combined_sample$pickup_wday prefix was redundant inside
# data.table scope and fragile if the table were filtered in i.
trip_combined_sample_cormat <- trip_combined_sample[,
                                                  list(rate_code,
                                                   pickup_datetime = as.numeric(pickup_datetime),
                                                   dropoff_datetime = as.numeric(dropoff_datetime),
                                                   passenger_count, trip_time_in_secs, trip_distance,
                                                   pickup_longitude, pickup_latitude,
                                                   dropoff_longitude, dropoff_latitude,
                                                   pickup_wday = as.numeric(pickup_wday),
                                                   pickup_hour, pickup_dist_from_city, fare_amount)]

# Pairwise-complete correlations tolerate any residual NAs
corr <- cor(trip_combined_sample_cormat, use = "pairwise.complete.obs")

# Lower-triangle correlation heat map with coefficients printed
ggcorrplot(corr, hc.order = FALSE, type = "lower",
           ggtheme = ggthemes::theme_gdocs,
           colors = c("#ff7f0e", "white", "#1f83b4"),
           lab = TRUE) +
            ggtitle("Correlation Matrix") +
            theme(panel.grid.major=element_blank(),
                  plot.title = element_text(hjust = 0.5)) 

Linear Regression

Let’s try to build a model for predicting the fare amount based on all the continuous variables as shown in the above chart, and also try to build a model using just the highly correlated independent variables trip_distance and trip_time_in_secs.

Split data into train and test datasets

We will use 80% of the sample data to train the model and 20% of the data to test the model performance.

# Fix the RNG seed so the 80/20 partition is reproducible
set.seed(131)

# caret stratifies the sample on the outcome (fare_amount):
# 80% of rows go to training, the remainder to testing
training_samples <- createDataPartition(
  trip_combined_sample_cormat$fare_amount,
  p = 0.8,
  list = FALSE
)

train_data <- trip_combined_sample_cormat[training_samples, ]
test_data  <- trip_combined_sample_cormat[-training_samples, ]

Train a multiple linear regression model

For the first model, we will use all the continuous variables.

# Baseline multiple linear regression: fare_amount on every other column
# of the training data ("." expands to all remaining predictors)
model_1<-lm(fare_amount~., data = train_data)
model_summary <- summary(model_1)
model_summary
## 
## Call:
## lm(formula = fare_amount ~ ., data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -157.920   -0.342   -0.101    0.181  176.742 
## 
## Coefficients:
##                         Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)            1.591e+02  8.394e+00   18.950  < 2e-16 ***
## rate_code              6.652e+00  1.082e-02  614.827  < 2e-16 ***
## pickup_datetime        1.081e-04  2.803e-05    3.857 0.000115 ***
## dropoff_datetime      -1.081e-04  2.803e-05   -3.858 0.000114 ***
## passenger_count       -1.040e-02  1.506e-03   -6.909 4.87e-12 ***
## trip_time_in_secs      6.700e-03  2.862e-05  234.101  < 2e-16 ***
## trip_distance          1.642e+00  1.137e-03 1443.944  < 2e-16 ***
## pickup_longitude       4.013e-01  7.057e-02    5.686 1.30e-08 ***
## pickup_latitude        2.016e+00  6.646e-02   30.325  < 2e-16 ***
## dropoff_longitude      2.317e+00  5.703e-02   40.620  < 2e-16 ***
## dropoff_latitude      -7.273e-01  6.224e-02  -11.686  < 2e-16 ***
## pickup_wday           -1.102e-02  1.063e-03  -10.370  < 2e-16 ***
## pickup_hour           -1.208e-02  3.254e-04  -37.113  < 2e-16 ***
## pickup_dist_from_city -3.502e-03  6.034e-04   -5.804 6.50e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.27 on 1171794 degrees of freedom
## Multiple R-squared:  0.9437, Adjusted R-squared:  0.9437 
## F-statistic: 1.512e+06 on 13 and 1171794 DF,  p-value: < 2.2e-16

The model summary tells us that all the independent variables are significant in predicting the final value of the dependent variable (fare amount). An R-squared of 94.37% is a pretty good measure of the model performance. We need to ensure we are not overfitting the model and there are no effects of multi-collinearity among the independent variables that can impact the model performance.

Testing the model performance

We will use root mean square error (RMSE) and R-squared (R2) to measure and test the model performance.

RMSE is the mean error on the predictions and tells us how far the predicted value is from the actual value. For this metric, the lower the error, the better the model performance.

R2 is the measure of the proportion of the variability in the data that can be explained by the variations in the independent variables. Ideally, 100% of the variation in the dependent variable would be explained by the variations in the independent variables. So, for this metric, higher values suggest a better performing model.

# Score the held-out test set with the fitted baseline model
predictions <- predict(model_1, newdata = test_data)

# Test-set performance: RMSE (lower is better) and R2 (higher is better)
data.table(
  RMSE = RMSE(predictions, test_data$fare_amount),
  R2   = R2(predictions, test_data$fare_amount)
)
##        RMSE        R2
## 1: 2.210935 0.9464949

To detect multicollinearity in a regression model, we will test the Variance Inflation Factor (VIF) of each of the independent variables. The cut-off of VIF = 5 is used to determine the magnitude of multicollinearity, and variables that have VIF > 5 are generally removed from the regression model.

# Variance Inflation Factors for model_1; VIF > 5 flags problematic multicollinearity
vif(model_1)
##             rate_code       pickup_datetime      dropoff_datetime 
##          1.260701e+00          9.847355e+07          9.847408e+07 
##       passenger_count     trip_time_in_secs         trip_distance 
##          1.000551e+00          5.606676e+01          3.310206e+00 
##      pickup_longitude       pickup_latitude     dropoff_longitude 
##          1.844589e+00          1.267236e+00          1.197876e+00 
##      dropoff_latitude           pickup_wday           pickup_hour 
##          1.160223e+00          1.007685e+00          1.008004e+00 
## pickup_dist_from_city 
##          1.746623e+00

The VIF for two of the independent variables, pickup_datetime and dropoff_datetime, is quite high.
Let’s remove these variables and retry the regression model.

# Refit the regression with the two high-VIF predictors
# (pickup_datetime, dropoff_datetime) dropped from the formula
model_2<-lm(fare_amount~. -pickup_datetime-dropoff_datetime, data = train_data)
model_summary <- summary(model_2)
model_summary
## 
## Call:
## lm(formula = fare_amount ~ . - pickup_datetime - dropoff_datetime, 
##     data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -157.911   -0.342   -0.101    0.181  176.747 
## 
## Coefficients:
##                         Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)            1.446e+02  7.467e+00   19.366  < 2e-16 ***
## rate_code              6.652e+00  1.082e-02  614.845  < 2e-16 ***
## passenger_count       -1.041e-02  1.506e-03   -6.915 4.70e-12 ***
## trip_time_in_secs      6.592e-03  5.999e-06 1098.724  < 2e-16 ***
## trip_distance          1.642e+00  1.137e-03 1444.053  < 2e-16 ***
## pickup_longitude       4.028e-01  7.057e-02    5.707 1.15e-08 ***
## pickup_latitude        2.016e+00  6.647e-02   30.327  < 2e-16 ***
## dropoff_longitude      2.317e+00  5.703e-02   40.635  < 2e-16 ***
## dropoff_latitude      -7.270e-01  6.224e-02  -11.681  < 2e-16 ***
## pickup_wday           -1.083e-02  1.062e-03  -10.196  < 2e-16 ***
## pickup_hour           -1.207e-02  3.253e-04  -37.109  < 2e-16 ***
## pickup_dist_from_city -3.488e-03  6.034e-04   -5.780 7.48e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.27 on 1171796 degrees of freedom
## Multiple R-squared:  0.9437, Adjusted R-squared:  0.9437 
## F-statistic: 1.787e+06 on 11 and 1171796 DF,  p-value: < 2.2e-16
# Score the held-out test set with the reduced model
predictions <- predict(model_2, newdata = test_data)

# Same metrics as model_1 so the two are directly comparable
data.table(
  RMSE = RMSE(predictions, test_data$fare_amount),
  R2   = R2(predictions, test_data$fare_amount)
)
##        RMSE        R2
## 1: 2.210928 0.9464953
# Re-check multicollinearity after dropping the two datetime predictors
vif(model_2)
##             rate_code       passenger_count     trip_time_in_secs 
##              1.260680              1.000541              2.463868 
##         trip_distance      pickup_longitude       pickup_latitude 
##              3.309251              1.844557              1.267235 
##     dropoff_longitude      dropoff_latitude           pickup_wday 
##              1.197864              1.160221              1.006523 
##           pickup_hour pickup_dist_from_city 
##              1.007176              1.746588

Both the RMSE and R2 get slightly better by removing the two high-VIF variables, though the change in model performance is not that significant. The VIFs of model_2 improve substantially. Hence, model_2 is better than model_1.

Let’s also look at predicting the fare amount solely based on trip distance and trip duration and see if the model performance improves or declines.

# Parsimonious model: fare_amount predicted from only the two
# most strongly correlated features (distance and duration)
model_3<-lm(fare_amount~trip_distance+trip_time_in_secs, data = train_data)
model_summary <- summary(model_3)
model_summary
## 
## Call:
## lm(formula = fare_amount ~ trip_distance + trip_time_in_secs, 
##     data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -157.321   -0.356   -0.065    0.204  193.900 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.019e+00  4.147e-03   486.8   <2e-16 ***
## trip_distance     1.873e+00  1.114e-03  1680.8   <2e-16 ***
## trip_time_in_secs 6.319e-03  6.814e-06   927.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.613 on 1171805 degrees of freedom
## Multiple R-squared:  0.9254, Adjusted R-squared:  0.9254 
## F-statistic: 7.273e+06 on 2 and 1171805 DF,  p-value: < 2.2e-16
# Score the held-out test set with the two-variable model
predictions <- predict(model_3, newdata = test_data)

# Same metrics as before for a like-for-like comparison
data.table(
  RMSE = RMSE(predictions, test_data$fare_amount),
  R2   = R2(predictions, test_data$fare_amount)
)
##        RMSE        R2
## 1: 2.543644 0.9291734
# VIF for the two-predictor model (with two predictors the VIFs are equal)
vif(model_3)
##     trip_distance trip_time_in_secs 
##          2.398172          2.398172

The RMSE increases and R-squared decreases for the third model with just two independent variables, which makes the model_2 as the best model so far.

Machine Learning

xgboost model

Next we try to fit a machine learning model to our data and see if we can get a better performing model. We are going to try an Extreme Gradient Boosting (xgboost) model for our regression problem. Here we fit an xgboost model with randomly chosen hyperparameters using the same training dataset that we used earlier for the linear models, and test the model performance using the same metrics and the same test dataset.

# Convert train/test frames to xgboost's DMatrix format.
# Columns 1-12 are the predictors; fare_amount is attached as the label.
xg_train <- xgb.DMatrix(
  data.matrix(train_data[, names(train_data)[1:12], with = FALSE]),
  label = train_data$fare_amount,
  missing = NaN
)

xg_test <- xgb.DMatrix(
  data.matrix(test_data[, names(test_data)[1:12], with = FALSE]),
  label = test_data$fare_amount,
  missing = NaN
)


# Hyperparameter has been randomly chosen but can be tuned using a gridsearch method
model_4 <- xgboost(data = xg_train, # training data as matrix
                          label = train_data$fare_amount,  # column of outcomes
                          nrounds = 100,       # number of trees to build
                          objective = 'reg:squarederror', # objective
                          eta = 0.3,
                          depth = 6,
                          verbose = 0  # silent
)

# Predict on the test DMatrix and compute the same metrics used for the linear models
pred <- predict(model_4, xg_test)

data.table(
  RMSE = RMSE(pred, test_data$fare_amount),
  R2   = R2(pred, test_data$fare_amount)
)
##        RMSE        R2
## 1: 1.038733 0.9881935

We can see that the machine learning model performs much better than linear regression with a lower RMSE and better R-squared

# Per-feature contribution to the boosted model (Gain = improvement from splits on the feature)
xgb.importance(names(train_data)[1:12], model = model_4)
##               Feature         Gain       Cover  Frequency
##  1:     trip_distance 7.607002e-01 0.275534885 0.19776407
##  2: trip_time_in_secs 1.921994e-01 0.199702106 0.19969160
##  3:         rate_code 3.789770e-02 0.084670402 0.12837317
##  4: dropoff_longitude 4.076790e-03 0.131179263 0.11661527
##  5:  pickup_longitude 1.759692e-03 0.064025081 0.07710100
##  6:  dropoff_latitude 1.620849e-03 0.113969676 0.08847340
##  7:   pickup_latitude 8.579963e-04 0.051549190 0.05705474
##  8:   pickup_datetime 4.253560e-04 0.008110852 0.06784888
##  9:       pickup_hour 2.868172e-04 0.041595773 0.03585197
## 10:       pickup_wday 7.111016e-05 0.013494399 0.01021588
## 11:   passenger_count 5.845308e-05 0.004684354 0.01021588
## 12:  dropoff_datetime 4.566125e-05 0.011484020 0.01079414

The feature importance shows the top two features are trip_distance and trip_time_in_secs

Mean values as central tendency

We have already established that our data has outliers and that the distributions of fare amount and trip duration are positively skewed (right-skewed). Since the distributions are not normal, the respective means for fare amount and trip duration cannot be treated as a measure of central tendency unless we make the distributions approximately normal by removing the outliers or by transforming the variables (fare and distance).

Maximize earnings in a day

Assuming an 8-hour shift, let’s see which starting hour of the shift in a day gives maximum earnings.

# Total fare collected per pickup hour (0-23), ordered by hour
sum_earnings_hr <- trip_combined_sample[, .(fare_amount_sum = sum(fare_amount)), by = list(pickup_hour)][order(pickup_hour)]

# Append the first 7 hours at the end so an 8-hour window starting at any
# hour (including 17:00-23:00) can wrap around midnight in the rolling sum
sum_earnings_hr <- rbind(sum_earnings_hr, head(sum_earnings_hr, 7))

# Left-aligned rolling sum: row h now holds total earnings for the
# 8-hour shift that STARTS at hour h
sum_earnings_hr[, shift_earnings := frollsum(fare_amount_sum, n = 8, align = 'left')]

# The appended rows have incomplete windows (NA) — drop them,
# leaving exactly one row per starting hour 0-23
sum_earnings_hr <- sum_earnings_hr[!is.na(shift_earnings)]

# Bar chart of shift earnings by starting hour, sorted descending
ggplot(sum_earnings_hr, aes(reorder(pickup_hour, -shift_earnings), shift_earnings, fill = reorder(pickup_hour, -shift_earnings))) +
  geom_col() +
  scale_y_sqrt() +
  labs(x = "Hour of the day", y = "Total earnings") +
  ggtitle("Total earnings for 8 hour shift by starting hour ") +
  theme(panel.background = element_blank(), 
        axis.line = element_line(colour = "black"), 
        plot.title = element_text(hjust = 0.5)) +
  theme(legend.position = "none")

From the chart, it is clear that to maximise the earnings, it is better to start the 8-hour shift from 4pm (evenings)

Minimize work time in a day

Here the assumption is that work time is calculated by the time spent working on trips (trip durations).
Again assuming an 8-hour shift, let’s see which starting hour of the shift in a day gives more than average earnings by spending least amount of trip hours

# Total fare and total trip duration per pickup hour (0-23), ordered by hour
sum_earnings_trips_hr <- trip_combined_sample[, .(fare_amount_sum = sum(fare_amount), trip_time_sum = sum(trip_time_in_secs)), by = list(pickup_hour)][order(pickup_hour)]

# Append the first 7 hours at the end so an 8-hour window starting at any
# hour can wrap around midnight in the rolling means below
sum_earnings_trips_hr <- rbind(sum_earnings_trips_hr, head(sum_earnings_trips_hr, 7))

# Left-aligned rolling means: row h holds the per-hour average earnings and
# trip time for the 8-hour shift that STARTS at hour h
sum_earnings_trips_hr[, fare_amount_sum_shift := frollmean(fare_amount_sum, n = 8, align = 'left')]
sum_earnings_trips_hr[, trip_time_sum_shift := frollmean(trip_time_sum, n = 8, align = 'left')]

# The appended rows have incomplete windows (NA) — drop them,
# leaving exactly one row per starting hour 0-23
sum_earnings_trips_hr <- sum_earnings_trips_hr[!is.na(fare_amount_sum_shift)]

# Keep only shifts with at-least-average earnings, then chart their
# trip-time totals ascending: leftmost bar = least time worked
ggplot(sum_earnings_trips_hr[fare_amount_sum_shift>=mean(fare_amount_sum_shift)], 
       aes(reorder(pickup_hour, trip_time_sum_shift), trip_time_sum_shift, fill = reorder(pickup_hour, trip_time_sum_shift))) +
  geom_col() +
  scale_y_sqrt() +
  labs(x = "Hour of the day", y = "Total trip durations") +
  ggtitle("Total trip durations for 8 hour shift by starting hour") +
  theme(panel.background = element_blank(), 
        axis.line = element_line(colour = "black"), 
        plot.title = element_text(hjust = 0.5)) +
  theme(legend.position = "none")

The chart shows that if we start the 8-hour shift from 7pm (evenings), we will spend the least time on taxi trips while still maintaining average earnings.

Maximising 10 taxis fleet

Based on the given data:

  • Assuming a percentage share in the fare, target pickups in the evenings after 4pm
  • Considering the expenses of repairing/replacing taxis, target shorter trips (time/distance) while still maintaining average fare income
  • Not considering the competition, the target location would be within a 10 km radius of the NYC center
  • Newer taxis could be targeted for longer-trip hotspots while older ones can be targeted for shorter trips
  • More complex analysis could be done on taxi idle time/location compared to demand time/location, redirecting taxis based on an optimisation algorithm